[X86] Lower mathlib call ldexp into scalef when avx512 is enabled #166839
Conversation
@llvm/pr-subscribers-backend-x86

Author: Kavin Gnanapandithan (KavinTheG)

Changes: Resolves issue #165694.

Patch is 49.73 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/166839.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 133406bd8e0d7..f9e9bb26638d4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2590,6 +2590,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
}
+ if (Subtarget.hasAVX512()) {
+ for (MVT VT : { MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+
+ if (Subtarget.hasVLX()) {
+ for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+
+ if (Subtarget.hasFP16()) {
+ for (MVT VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16 })
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+ }
+ }
+
+ if (Subtarget.hasFP16()) {
+ for (MVT VT : { MVT::f16, MVT::v32f16 })
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+ }
+ }
+
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
@@ -19142,6 +19162,58 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return SDValue();
}
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT XTy = X.getSimpleValueType();
+ SDValue Exp = Op.getOperand(1);
+ MVT XVT, ExpVT;
+
+ switch (XTy.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::f16:
+ if (Subtarget.hasFP16()) {
+ XVT = Subtarget.hasVLX() ? MVT::v8f16 : MVT::v32f16;
+ ExpVT = XVT;
+ break;
+ }
+ X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+ [[fallthrough]];
+ case MVT::f32:
+ XVT = MVT::v4f32;
+ ExpVT = MVT::v4f32;
+ break;
+ case MVT::f64:
+ XVT = MVT::v2f64;
+ ExpVT = MVT::v2f64;
+ break;
+ case MVT::v4f32:
+ case MVT::v2f64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ case MVT::v16f32:
+ case MVT::v8f64:
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
+ }
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
+ SDValue VX =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
+ SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
+ DAG.getUNDEF(ExpVT), Exp, Zero);
+ SDValue Scalef = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX);
+ SDValue Final =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), Scalef, Zero);
+ if (X.getValueType() != XTy)
+ Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final,
+ DAG.getIntPtrConstant(1, SDLoc(Op)));
+ return Final;
+}
+
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -33672,6 +33744,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
+ case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
// clang-format on
}
}
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 81529aff39ff1..499695f408396 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -79,38 +79,64 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: subq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
-; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vmovd %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-AVX-NEXT: addq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: subq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vmovd %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-AVX2-NEXT: addq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm1
+; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm1, %xmm2, %xmm1
+; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm2, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-ONLY-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
@@ -560,82 +586,109 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
-; CHECK-AVX512F: # %bb.0:
-; CHECK-AVX512F-NEXT: subq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512F-NEXT: addq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
-; CHECK-AVX512F-NEXT: retq
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
+; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2
+; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm2, %xmm1, %xmm2
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vmovd %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm1, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-ONLY-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpextrw $7, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1
+; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm2 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-SKX-NEXT: vscalefss %xmm1, %xmm2, %xmm1
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-SKX-NEXT: vpextrw $6, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; CHECK-SKX-NEXT: vpextrw $5, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT: vpextrw $4, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SKX-NEXT: vpextrw $3, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT: vpextrw $2, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-SKX-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT: vmovd %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
+; CHECK-SKX-NEXT: vscalefss %xmm0, %xmm2, %xmm0
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
}
@@ -1769,3 +1822,5 @@ define x86_fp80 @pr128528(i1 %cond) {
%mul = fmul x86_fp80 %conv, 0xK4007D055555555555800
ret x86_fp80 %mul
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below...
[truncated]
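For context on what is being lowered (a minimal illustration, not part of the patch): ldexp(x, n) computes x * 2^n exactly, which is what the AVX-512 SCALEF instruction family implements, so on AVX-512 targets the libm call can become a single instruction.

    #include <cmath>

    // With this patch, on an AVX-512 target a call like this can compile to
    // a vscalefss instruction instead of a call to ldexpf.
    float scale_by_pow2(float x, int n) { return std::ldexp(x, n); }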
✅ With the latest revision this PR passed the C/C++ code formatter.
RKSimon left a comment:
Nice - a few minors
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
  SDValue VX =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
Use DAG.getInsertVectorElt
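A minimal sketch of that cleanup, assuming the SelectionDAG::getInsertVectorElt convenience helper the reviewer names (it wraps ISD::INSERT_VECTOR_ELT and materializes the index operand):

    // Hedged sketch: same nodes as the quoted lines, via the wrapper.
    SDValue VX = DAG.getInsertVectorElt(DL, DAG.getUNDEF(XVT), X, 0);
    SDValue VExp = DAG.getInsertVectorElt(DL, DAG.getUNDEF(ExpVT), Exp, 0);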
      setOperationAction(ISD::FLDEXP, VT, Custom);

    if (Subtarget.hasFP16()) {
      for (MVT VT : {MVT::f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
if we're extending on non-FP16 targets - why not drop the hasFP16 requirement and add it to the default AVX512 type list above? The only tricky one will be MVT::v32f16 which will need splitting / concatenating
I've added better half-vector coverage at #167294
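A hypothetical shape of that suggestion (the type list is illustrative, not from the patch): since LowerFLDEXP can already extend f16 to f32, the scalar half type could be marked Custom under plain AVX512, with v32f16 split into halves during lowering.

    if (Subtarget.hasAVX512()) {
      // f16 handled unconditionally; LowerFLDEXP extends it to f32 when the
      // target lacks AVX512-FP16. v32f16 would additionally need splitting
      // into two v16f16 halves.
      for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
        setOperationAction(ISD::FLDEXP, VT, Custom);
    }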
I'll try giving it a go.
  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
  if (XTy.isVector()) {
    SDValue WideX = DAG.getInsertSubvector(DL, DAG.getUNDEF(XVT), X, 0);
    SDValue WideExp = DAG.getInsertSubvector(DL, DAG.getUNDEF(ExpVT), Exp, 0);
You should be able to just use widenSubVector and tell it to widen to 512-bits instead of recomputing XVT/ExpVT
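A sketch of that route, assuming the widenSubVector overload in X86ISelLowering.cpp that takes the desired width in bits:

    // Hedged sketch: widen both operands to 512 bits instead of recomputing
    // the wide types by hand; ZeroNewElements=false leaves the new lanes
    // undefined, which is fine since only the low elements are used.
    SDValue WideX = widenSubVector(X, false, Subtarget, DAG, DL, 512);
    SDValue WideExp = widenSubVector(Exp, false, Subtarget, DAG, DL, 512);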
Force-pushed from 7f97af2 to 8649bce
    SDValue OpHigh = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, High, ExpHigh);
    SDValue ScaledLow = LowerFLDEXP(OpLow, Subtarget, DAG);
    SDValue ScaledHigh = LowerFLDEXP(OpHigh, Subtarget, DAG);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v32f16, ScaledLow,
You might be able to use splitVectorOp here?
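A sketch of that alternative, assuming the static splitVectorOp(Op, DAG, dl) helper in X86ISelLowering.cpp, which re-emits a node on each half of its operands and concatenates the results (the guarding condition here is illustrative):

    // Hedged sketch: let the helper split the v32f16 FLDEXP into two v16f16
    // ops instead of extracting and re-lowering the halves manually.
    if (XTy == MVT::v32f16 && !Subtarget.hasFP16())
      return splitVectorOp(Op, DAG, DL);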
  case MVT::v2f64:
    if (Subtarget.hasVLX()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEFS, DL, XTy, X, Exp, X);
Why SCALEFS?
Mistake on my part, should be SCALEF.
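With that fix, the VLX path matches what the patch already does for the 512-bit types: lane-wise SCALEF with the exponent converted in place.

    // SCALEF scales every lane, so the 128/256-bit vectors are used directly.
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
    return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);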
Quick question. Given a non-VLX target and vectors v4f32/v2f64, would it be better to return early in this case with SCALEFS instead of extending, since SCALEFS can work on xmm registers? Like so.
No, SCALEFS will only touch element[0] - not the upper elements of the 128-bit vector.
Resolves #165694